/**
* Copyright 2013 MIR@MU Project
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package cz.muni.fi.mir.mathmlcanonicalization.modules;
import static cz.muni.fi.mir.mathmlcanonicalization.modules.AbstractModule.MATHMLNS;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.jdom2.Content;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.Text;
import org.jdom2.filter.ContentFilter;
import org.jdom2.filter.ElementFilter;
/**
 * Normalizes the operators used in MathML: applies Unicode normalization to
 * text nodes, converts configured identifiers to operators (and vice versa),
 * removes configured or empty operators and replaces operators according to a
 * configured mapping.
 * <div class="simpleTagLabel">Input</div> Well-formed MathML, not processed by
 * MrowMinimizer yet
 * <div class="simpleTagLabel">Output</div> The original code with:
 * <ul>
 * <li>normalized Unicode symbols</li>
 * <li>unified operators</li>
 * <li>no redundant operators</li>
 * </ul>
 *
 * @author David Formanek
 */
public class OperatorNormalizer extends AbstractModule implements DOMModule {

    private static final Logger LOGGER = Logger.getLogger(OperatorNormalizer.class.getName());

    // properties key names
    /** Switch: when enabled, operators whose text is empty are removed. */
    private static final String REMOVE_EMPTY_OPERATORS = "removeempty";
    /** Set of operator texts that are to be removed from the document. */
    private static final String OPERATORS_TO_REMOVE = "removeoperators";
    /** Space-separated list of {@code old:new} operator replacement pairs. */
    private static final String OPERATOR_REPLACEMENTS = "replaceoperators";
    /** Replacement for the ":" operator; empty value means no replacement. */
    private static final String COLON_REPLACEMENT = "colonreplacement";
    /** Name of a {@link Normalizer.Form} constant; empty value switches normalization off. */
    private static final String NORMALIZATION_FORM = "normalizationform";
    /** Set of texts; identifier elements containing them are turned into operators. */
    private static final String OPERATORS = "operators";
    /** Set of texts; operator elements containing them are turned into identifiers. */
    private static final String IDENTIFIERS = "identifiers";

    /**
     * Creates the module and declares all configuration properties it reads.
     */
    public OperatorNormalizer() {
        declareProperty(REMOVE_EMPTY_OPERATORS);
        declareProperty(OPERATORS_TO_REMOVE);
        declareProperty(OPERATOR_REPLACEMENTS);
        declareProperty(COLON_REPLACEMENT);
        declareProperty(NORMALIZATION_FORM);
        declareProperty(OPERATORS);
        declareProperty(IDENTIFIERS);
    }

    /**
     * Runs the Unicode normalization (unless switched off) followed by the
     * operator unification on the given document. The document is modified in
     * place.
     *
     * @param doc document to process
     * @throws NullPointerException if {@code doc} is null
     * @throws IllegalArgumentException if the normalization form property is
     * neither empty nor a name of a {@link Normalizer.Form} constant
     */
    @Override
    public void execute(final Document doc) {
        if (doc == null) {
            throw new NullPointerException("doc");
        }
        final Element root = doc.getRootElement();
        // TODO: convert Unicode superscripts (supX entities) to msup etc.
        final String normalizerFormStr = getProperty(NORMALIZATION_FORM);
        if (normalizerFormStr.isEmpty()) {
            LOGGER.fine("Unicode text normalization is switched off");
        } else {
            final Normalizer.Form normalizerForm;
            // Only the parsing of the configured value is guarded, so that an
            // IllegalArgumentException raised while rewriting the document is
            // not misreported as a configuration problem.
            try {
                normalizerForm = Normalizer.Form.valueOf(normalizerFormStr);
            } catch (IllegalArgumentException ex) {
                throw new IllegalArgumentException("Invalid configuration value: "
                        + NORMALIZATION_FORM, ex);
            }
            normalizeUnicode(root, normalizerForm);
        }
        unifyOperators(root);
    }

    /**
     * Converts configured identifiers to operators, removes redundant
     * operators, replaces operators according to the configured mapping and
     * finally converts configured operators to identifiers.
     *
     * @param ancestor element whose subtree is processed
     */
    private void unifyOperators(final Element ancestor) {
        assert ancestor != null;
        final Set<String> toRemove = getPropertySet(OPERATORS_TO_REMOVE);
        final Map<String, String> replaceMap = getPropertyMap(OPERATOR_REPLACEMENTS);
        // ":" separates the pairs in the replacement property, so a
        // replacement for the colon itself has its own property
        final String colonReplacement = getProperty(COLON_REPLACEMENT);
        if (!colonReplacement.isEmpty()) {
            replaceMap.put(":", colonReplacement);
        }

        // every symbol mentioned in any operator-related property is treated
        // as an operator when it appears inside an identifier element
        final Set<String> operators = getPropertySet(OPERATORS);
        operators.addAll(toRemove);
        operators.addAll(replaceMap.keySet());
        operators.addAll(replaceMap.values());
        replaceIdentifiers(ancestor, operators);

        if (isEnabled(REMOVE_EMPTY_OPERATORS) || !toRemove.isEmpty()) {
            removeSpareOperators(ancestor, toRemove);
        } else {
            LOGGER.fine("No operators set for removal");
        }

        if (replaceMap.isEmpty()) {
            LOGGER.fine("No operators set to replace");
        } else {
            replaceOperators(ancestor, replaceMap);
        }

        final Set<String> identifiers = getPropertySet(IDENTIFIERS);
        operatorsToIdentifiers(ancestor, identifiers);
    }

    /**
     * Applies the given Unicode normalization form to every text node that is
     * a direct child of the ancestor or of any of its descendant elements.
     *
     * @param ancestor element whose subtree is processed
     * @param form normalization form to apply
     */
    private void normalizeUnicode(final Element ancestor, final Normalizer.Form form) {
        assert ancestor != null && form != null;
        // collect first, modify afterwards, so that the descendant iterator
        // is not used while the tree content is being changed
        final List<Text> texts = new ArrayList<Text>();
        final ContentFilter textFilter = new ContentFilter(ContentFilter.TEXT);
        for (Content text : ancestor.getContent(textFilter)) {
            texts.add((Text) text);
        }
        for (Element element : ancestor.getDescendants(new ElementFilter())) {
            for (Content text : element.getContent(textFilter)) {
                texts.add((Text) text);
            }
        }
        for (Text text : texts) {
            final String original = text.getText();
            if (Normalizer.isNormalized(original, form)) {
                continue;
            }
            final String normalizedString = Normalizer.normalize(original, form);
            LOGGER.log(Level.FINE, "Text ''{0}'' normalized to ''{1}''",
                    new Object[]{original, normalizedString});
            text.setText(normalizedString);
            assert Normalizer.isNormalized(text.getText(), form);
        }
    }

    /**
     * Recursively detaches spare operators (see
     * {@link #isSpareOperator(Element, Collection)}) from the subtree of the
     * given element. Operators that are direct children of {@code msub},
     * {@code msubsup} or {@code msup} are always kept.
     *
     * @param element element whose subtree is processed
     * @param spareOperators texts of operators to remove; may be empty when
     * only the removal of empty operators is enabled
     */
    private void removeSpareOperators(final Element element, final Collection<String> spareOperators) {
        assert element != null && spareOperators != null;
        // Keep special case where an operator (e.g. asterisk) is by itself in
        // a subscript or superscript. All children share the same parent, so
        // the check is done once per element.
        final String parentName = element.getName();
        final boolean insideScript = parentName.equals("msub")
                || parentName.equals("msubsup")
                || parentName.equals("msup");
        final List<Element> children = element.getChildren();
        for (int i = 0; i < children.size(); i++) {
            final Element actual = children.get(i); // actual element
            if (isOperator(actual)) {
                if (!insideScript && isSpareOperator(actual, spareOperators)) {
                    actual.detach();
                    i--; // move iterator back after detaching so it points to next element
                    LOGGER.log(Level.FINE, "Operator {0} removed", actual);
                }
            } else {
                removeSpareOperators(actual, spareOperators);
            }
        }
    }

    /**
     * Tells whether the operator should be removed: either its (untrimmed)
     * text is empty and the removal of empty operators is enabled, or its
     * trimmed text is listed among the spare operators.
     *
     * @param operator operator element to test
     * @param spareOperators texts of operators to remove
     * @return true if the operator is spare
     */
    private boolean isSpareOperator(final Element operator, final Collection<String> spareOperators) {
        assert operator != null && spareOperators != null && isOperator(operator);
        return (isEnabled(REMOVE_EMPTY_OPERATORS) && operator.getText().isEmpty())
                || (spareOperators.contains(operator.getTextTrim()));
    }

    /**
     * Replaces the text of every descendant operator element whose trimmed
     * text is a key of the replacement map by the mapped value.
     *
     * @param element element whose subtree is processed
     * @param replacements mapping from original to new operator text
     */
    private void replaceOperators(final Element element, final Map<String, String> replacements) {
        assert element != null && replacements != null;
        for (Element operator : collectElementsWithText(element, OPERATOR, replacements.keySet())) {
            final String oldOperator = operator.getTextTrim();
            final String newOperator = replacements.get(oldOperator);
            operator.setText(newOperator);
            LOGGER.log(Level.FINE, "Operator ''{0}'' was replaced by ''{1}''",
                    new Object[]{oldOperator, newOperator});
        }
    }

    /**
     * Converts descendant identifier elements whose trimmed text is listed in
     * the given set to operator elements, using the inherited
     * {@code replaceElement} helper.
     *
     * @param ancestor element whose subtree is processed
     * @param operators texts that denote operators
     */
    private void replaceIdentifiers(final Element ancestor, final Set<String> operators) {
        assert ancestor != null && operators != null;
        // TODO: control whole ranges of symbols rather than listed ones
        for (Element element : collectElementsWithText(ancestor, IDENTIFIER, operators)) {
            LOGGER.log(Level.FINE, "Creating an operator from {0}", element.getText());
            replaceElement(element, OPERATOR);
        }
    }

    /**
     * Converts descendant operator elements whose trimmed text is listed in
     * the given set to identifier elements, using the inherited
     * {@code replaceElement} helper.
     *
     * @param ancestor element whose subtree is processed
     * @param identifiers texts that denote identifiers
     */
    private void operatorsToIdentifiers(final Element ancestor, final Set<String> identifiers) {
        assert ancestor != null && identifiers != null;
        for (Element element : collectElementsWithText(ancestor, OPERATOR, identifiers)) {
            LOGGER.log(Level.FINE, "Creating an identifier from {0}", element.getText());
            replaceElement(element, IDENTIFIER);
        }
    }

    /**
     * Collects, in document order, the descendant elements of the given name
     * (in the MathML namespace) whose trimmed text is contained in the given
     * collection. The result is a snapshot, so the returned elements can be
     * modified or replaced safely while iterating over it.
     *
     * @param ancestor element whose descendants are searched
     * @param elementName local name of the elements to look for
     * @param texts accepted element texts
     * @return list of matching elements, possibly empty
     */
    private static List<Element> collectElementsWithText(final Element ancestor,
            final String elementName, final Collection<String> texts) {
        final List<Element> found = new ArrayList<Element>();
        for (Element element : ancestor.getDescendants(new ElementFilter(elementName, MATHMLNS))) {
            if (texts.contains(element.getTextTrim())) {
                found.add(element);
            }
        }
        return found;
    }

    /**
     * Parses a property consisting of space-separated {@code key:value} pairs
     * into a map. Each pair is split on its first colon. An empty property
     * yields an empty map; redundant spaces between pairs are ignored.
     *
     * @param property name of the property to parse
     * @return new modifiable map with the parsed pairs
     * @throws IllegalArgumentException if a pair does not contain a colon
     */
    private Map<String, String> getPropertyMap(final String property) {
        assert property != null && isProperty(property);
        final Map<String, String> propertyMap = new HashMap<String, String>();
        for (String mapping : getProperty(property).split(" ")) {
            if (mapping.isEmpty()) {
                // produced by an empty property value or by repeated spaces
                continue;
            }
            final String[] mappingPair = mapping.split(":", 2);
            if (mappingPair.length != 2) {
                throw new IllegalArgumentException("property has wrong format");
            }
            propertyMap.put(mappingPair[0], mappingPair[1]);
        }
        return propertyMap;
    }
}